## Make output directory
outdir <- "~/multiOmic_benchmark/report/output/20191127_tcellTrajectory/"
ifelse(!dir.exists(outdir), dir.create(outdir), FALSE)
[1] FALSE
Based on the results of my benchmark, I set out to align expression and accessibility profiles from the F74 developing thymus dataset to detect changes in accessibility along pseudotime trajectories. While the benchmark was based on the task of label propagation, I here use the two most faithful methods (Seurat CCA and Conos) to achieve a common embedding of ATAC-seq and RNA-seq cells.
Load datasets.
rna.sce <- readRDS("~/my_data/F74_RNA_seurat_processed.RDS")
atac.sce <- readRDS("~/my_data/F74_ATAC_snapAtac_processed_bgmat.RDS")
## Re-normalize RNA data
seu.rna <- as.Seurat(rna.sce, counts = "counts")
seu.rna <- NormalizeData(seu.rna)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
logcounts(rna.sce) <- seu.rna@assays$RNA@data
Filter genes with zero variance
rna.gene.var <- as.matrix(counts(rna.sce)) %>% rowVars()
atac.gene.var <- as.matrix(counts(atac.sce)) %>% rowVars()
rna.sce <- rna.sce[which(rna.gene.var > 0),]
atac.sce <- atac.sce[which(atac.gene.var > 0),]
rna.sce; atac.sce
class: SingleCellExperiment
dim: 24510 8321
metadata(0):
assays(3): counts cpm logcounts
rownames(24510): RP11-34P13.3 RP11-34P13.7 ... AC233755.1 AC240274.1
rowData names(0):
colnames(8321): AAACCTGAGTTCGATC_1 AAACCTGCAAGTTGTC_1 ... TTTGTCAAGCTGAACG_2 TTTGTCAGTATTAGCC_2
colData names(1): annotation
reducedDimNames(0):
spikeNames(0):
class: SingleCellExperiment
dim: 31122 5793
metadata(0):
assays(3): counts cpm logcounts
rownames(31122): A1BG A1BG-AS1 ... ZYX ZZEF1
rowData names(0):
colnames(5793): AAACGAAAGTGAACCG-1 AAACGAACATCGGCCA-1 ... TTTGTGTTCGATCGCG-1 TTTGTGTTCTGAGTAC-1
colData names(28): orig.ident nCount_ATAC ... nFeature_ACTIVITY ident
reducedDimNames(2): LSI UMAP
spikeNames(0):
I re-run the integration based on the T cell subset. To select cells from the scATAC dataset, I take the SnapATAC clusters that best correspond to T-cells, based on label transfer.
tcells.sce.atac <- atac.sce[,which(as.numeric(atac.sce$seurat_clusters) %in% c(1:9))]
tcells.rna.ix <- which(rna.sce$annotation %in% c("DN","DP (Q)", "DP (P)", "SP (1)", "SP (2)"))
tcells.sce.rna <- rna.sce[,tcells.rna.ix]
tcells.sce.list <- list(RNA=tcells.sce.rna, ATAC=tcells.sce.atac)
## Make color palette 4 cell types
cell.types <- as.character(unique(tcells.sce.rna$annotation))
cell.type.pal <- brewer.pal(length(cell.types), "Set1") %>% rev() %>% setNames(cell.types)
Next, I select genes on which to perform integration. I take the union of the most variable features in the RNA dataset and the most covered features in the ATAC dataset
hcg.atac <- select_highlyCovered(tcells.sce.list$ATAC, frac_cells = 0.2)
hvg.rna <- select_highlyVariable(tcells.sce.list$RNA)
Calculating gene means
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variance to mean ratios
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
seu.rna <- FindVariableFeatures(seu.rna, nfeatures = 2000,
# selection.method = "mvp", dispersion.cutoff=c(0.7, 100), mean.cutoff=c(0.02, 3)
)
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
hvg.rna <- VariableFeatures(seu.rna)
VariableFeaturePlot(seu.rna)
UpSetR::upset(UpSetR::fromList(list(HVG.RNA=hvg.rna, HCG.ATAC=hcg.atac)))
Remove cell cycle genes, that might interfere with pseudotime ordering
cell_cycle_genes <- read.table("~/annotations/cell_cycle_genes.tsv")$V1
integrate_features_union <- union(hvg.rna, hcg.atac)
integrate_features_union <- setdiff(integrate_features_union, cell_cycle_genes)
## Select features in both datasets
integrate_features_union <- intersect(integrate_features_union, intersect(rownames(tcells.sce.list$ATAC), rownames(tcells.sce.list$RNA)))
tcells.seu.list <- map(tcells.sce.list, ~ as.Seurat(.x))
All keys should be one or more alphanumeric characters followed by an underscore '_', setting key to LSI_All keys should be one or more alphanumeric characters followed by an underscore '_', setting key to UMAP_
tcells.RNA.union <- tcells.seu.list$RNA
VariableFeatures(tcells.RNA.union) <- integrate_features_union
tcells.RNA.union <- ScaleData(tcells.RNA.union) %>% RunPCA() %>% RunUMAP(dims=1:40)
Centering and scaling data matrix
|
| | 0%
|
|====================== | 20%
|
|============================================= | 40%
|
|=================================================================== | 60%
|
|========================================================================================== | 80%
|
|================================================================================================================| 100%
The following 176 features requested have zero variance (running reduction without them): TM4SF18, MEOX2, ASGR2, DMRT2, KIR2DL4, CACNA1B, FREM2, MYZAP, TOX3, PRIMA1, PPP2R2B, PTPRZ1, CMTM5, TREM2, CALB2, TMTC1, CDH6, PKHD1L1, ISLR2, KRT7, MYH8, DCX, CCSER1, MSLN, CDH19, EYA4, P2RY12, PLA2G2D, ADAMTS16, KCNJ5, PAPPA, QRFPR, ADGRG2, NFIA-AS2, LUZP2, NALCN, FABP7, TSPEAR-AS1, GPR1, CD300LF, A4GALT, AP000439.1, CEACAM3, SLITRK2, KIR3DL1, AC147651.1, MUC15, FFAR4, C11orf53, KLK11, WIF1, CFHR1, TMEM233, FOLR3, GRID2, GALNT15, FAM19A1, TAC1, PTCHD4, CCL11, ZNF385B, GABRA1, ADH1B, LINC01048, LMOD1, TNNT1, ACTL6B, SEMA3E, HOXD-AS2, NTF4, SLC35F1, MANEAL, FEV, RAB3C, SYT9, HMCN2, ZFHX4-AS1, PCSK9, MMP12, ABCB11, AC002066.1, ADAM23, ADCY5, ADGB, ADGRB1, ANKRD29, ATP2B2, C4orf45, CCDC33, CDH18, CDK15, CERS1, CFAP161, CLRN1-AS1, CNBD1, CNGB3, CNTN5, COL8A2, CTNND2, DCC, DSCAM, EPHA6, FGF12, FSTL5, GABBR2, GABRB1, GABRG3, GALNT13, GHR, GLIS3, GNG12-AS1, GNGT1, GRIN2B, GRM1, GRM8, HMCN1, HPSE2, HS3ST5, HS6ST3, IL1RAPL2, ITGBL1, KCNB1, KCNH7, LINC00639, LINC01169, LINC01170, LINC01182, LINC01317, LINC01505, LINGO1, LINGO2, LRRC4C, LRRTM4, LY75-CD302, MAP6, MLIP, MYRIP, NEBL, NKAIN2, NKAIN3, NRG1, OPCML, PCDH15, PCDHGA4, PDZRN4, PIK3C2G, PKNOX2, PNPLA1, POU6F2, PPFIA2, PRR5-ARHGAP8, PTPN5, PTPRT, RALYL, RERG, RGS7, RIMS1, RIN2, RYR3, SHANK2, SNTG1, SORCS3, STON1-GTF2A1L, SYN2, TENM2, THSD4, TRPC6, TSPEAR, TVP23C-CDRT4, UNC80, USH1C, VAX2, VWA3B, XKR4, ZBTB7C, ZNF804BPC_ 1
Positive: SATB1, PTPRC, MTSS1, APBB1IP, LYST, SH2D1A, CAMK4, LBH, FBLN5, LEF1
PLEKHG1, VOPP1, CD247, MXD1, TCF12, ARHGEF7, ALDH1A2, NFATC3, TBC1D19, SYTL3
ADAMTS17, ANO6, DAPK1, CALN1, THEMIS, PITPNM2, NLGN4X, MAPRE2, GALNT7, ZNF280D
Negative: ENO1, GSTP1, FABP5, TMSB10, SMS, PKM, NME4, VIM, RPL37A, YWHAQ
NCL, IGFBP2, CAPG, NDUFA12, TRDC, PGK1, PARVB, LDHB, ATOX1, SELL
CDC123, NUDC, IGLL1, UBE2N, FXYD2, GMPS, C20orf27, SLC25A39, ANXA1, C12orf75
PC_ 2
Positive: RPL37A, EIF3H, LDHB, TBCA, IL32, SERGEF, SMPD3, ITGAE, AATF, FBLN5
NDUFA12, SOD1, DNAJC15, C12orf75, MRPL33, TMSB10, HNRNPC, CCDC57, SMCO4, GDI2
OLA1, ALDH1A2, COX7A2L, CST3, CYSTM1, ATOX1, GYPC, PDCD6, FABP5, RALY
Negative: MBNL1, ITPR2, PTPRC, MTRNR2L12, ADAM10, MSI2, CDK6, SELL, JCHAIN, RNF213
MME, TCF12, BPTF, BCL11B, NIPBL, MACF1, PIK3R1, HIVEP3, SOCS2, GALNT7
IKZF2, RUNX1, SPTBN1, PRRC2B, BCL2, DIAPH1, MYCBP2, SLC38A1, MBP, RUFY3
PC_ 3
Positive: HLA-B, TOX2, COTL1, CTSW, KLRB1, CD40LG, SIRPG, GZMM, CLDN1, CD74
ITM2A, GBP2, BACH2, HPGD, PDE4D, TNFRSF1B, S100A10, CLEC2D, XCL1, GFOD1
CRTAM, MATK, DENND2D, PDCD1, GIMAP4, STAT1, ZNF683, CD226, HLA-A, TNFRSF25
Negative: TFDP2, JCHAIN, ATP6AP1L, DEFA6, MME, GALNT2, PCGF5, ADGRG1, GLIPR1, MSI2
NINL, CEP70, CDK6, PTPN2, FXYD2, TRDC, GSTP1, SELL, SOCS2, RGPD3
SMPD3, FABP5, LYST, SSBP2, PITPNM2, DLEU7, UBE2E1, NUCB2, PPP1R1C, BCL11A
PC_ 4
Positive: SMPD3, C12orf75, TUBA1C, LCP1, HIST1H2AB, SMS, SMCO4, GMPS, FABP5, XPO1
IGFBP2, CCND3, TAF15, RGS3, SREBF2, YWHAQ, TMSB15A, ABCD3, NCL, PHIP
GNAS, EPB41L2, SYNE2, STAG2, CNTLN, VIM, NUP210, MCTP1, NUDC, NFATC3
Negative: JCHAIN, SELL, DEFA6, SOCS2, GLIPR1, ATP6AP1L, MME, RGPD3, XG, DPP4
IFI6, ENAM, GNAL, EVL, NEGR1, FRMPD2, EVA1A, PIK3CD, GALNT2, NDST3
MBP, RNF144A, FXYD2, COL1A2, SMIM24, NDFIP2, OXNAD1, ACTN1, LSP1, LINC00861
PC_ 5
Positive: HPGD, BACH2, TOX2, ITM2A, GZMM, ST6GAL1, ARAP2, CD96, TGFBR2, LZTFL1
VIM, EVL, CD44, SATB1, IL2, TUSC3, CYTIP, PDE4D, MAD1L1, SLC2A3
ARHGAP31, GPR183, PGK1, ANKRD44, STK4, BCL2, GNG2, IKZF1, ICOS, ETS1
Negative: XCL1, TNFRSF9, CTSW, XCL2, GBP2, TNFRSF1B, S100A4, GNG4, NPW, CD74
HOPX, IGFBP4, CD151, SH3BGRL2, NFATC1, LYST, ATP9A, NR4A2, LINC01480, CLEC2D
MIR3142HG, LINC01281, PREX1, HDAC9, PHACTR1, MAP7, CST3, CXCR3, PITPNM2, PDCD1
The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session09:11:47 UMAP embedding parameters a = 0.9922 b = 1.112
09:11:47 Read 7101 rows and found 40 numeric columns
09:11:47 Using Annoy for neighbor search, n_neighbors = 30
09:11:47 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
09:11:48 Writing NN index file to temp file /tmp/RtmpIaaTXb/file9c876abf67
09:11:48 Searching Annoy index using 1 thread, search_k = 3000
09:11:51 Annoy recall = 100%
09:11:52 Commencing smooth kNN distance calibration using 1 thread
09:11:55 Initializing from normalized Laplacian + noise
09:11:55 Commencing optimization for 500 epochs, with 314138 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
09:12:14 Optimization finished
DimPlot(tcells.RNA.union, group.by = "annotation", label=TRUE) + ggtitle("RNA - feature union")
Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
Please use `as_label()` or `as_name()` instead.
[90mThis warning is displayed once per session.[39m
Visualize markers
t.cell.markers <- list(known.markers = c("CD34", "IGLL1", "TRGC2", "TRDC", "PTCRA", "TRBC2", "TRAC", "CD4", "CD8A", "CD8B"),
chemokine.receptors = c("CCR9", "CCR7"),
tcr.activation = c("CD5", "CD27"),
proliferation=c("PCNA", "CDK1", "MKI67"),
cyclin.D = c("CCND2", "CCND3"),
recombination=c("RAG1", "RAG2"),
apoptosis=c("HRK","BMF", "TP53INP1"),
stage.markers = c("ST18", "HIVEP3", "RGPD3", "SMPD3", "AQP3", "RORC", "SATB1", "TOX2")
)
# FeaturePlot(tcells.RNA.ref, features = t.cell.markers$known.markers, cols = viridis::viridis(n=10))
FeaturePlot(tcells.RNA.union, features = t.cell.markers$known.markers, cols = viridis::viridis(n=10))
Colored by clusters called with SnapATAC
tcells.ATAC.union <- tcells.seu.list$ATAC
# tcells.ATAC.union <- NormalizeData(tcells.ATAC.union)
VariableFeatures(tcells.ATAC.union) <- integrate_features_union
tcells.ATAC.union <- RunLSI(tcells.ATAC.union, n=50, scale.max = NULL)
RunLSI is being moved to Signac. Equivalent functionality can be achieved via the Signac::RunTFIDF and Signac::RunSVD functions; for more information on Signac, please see https://github.com/timoast/SignacRunLSI is being moved to Signac. Equivalent functionality can be achieved via the Signac::RunTFIDF and Signac::RunSVD functions; for more information on Signac, please see https://github.com/timoast/SignacRunLSI is being moved to Signac. Equivalent functionality can be achieved via the Signac::RunTFIDF and Signac::RunSVD functions; for more information on Signac, please see https://github.com/timoast/SignacPerforming TF-IDF normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Running SVD on TF-IDF matrix
Scaling cell embeddings
Cannot add objects with duplicate keys (offending key: LSI_), setting key to 'lsi_'
tcells.ATAC.union <- RunUMAP(tcells.ATAC.union, reduction = "lsi", dims = 1:50)
10:10:05 UMAP embedding parameters a = 0.9922 b = 1.112
10:10:05 Read 4977 rows and found 50 numeric columns
10:10:05 Using Annoy for neighbor search, n_neighbors = 30
10:10:05 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
10:10:06 Writing NN index file to temp file /tmp/RtmpIaaTXb/file9c861ce1d48
10:10:06 Searching Annoy index using 1 thread, search_k = 3000
10:10:08 Annoy recall = 100%
10:10:11 Commencing smooth kNN distance calibration using 1 thread
10:10:14 Initializing from normalized Laplacian + noise
10:10:14 Commencing optimization for 500 epochs, with 172060 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
10:10:26 Optimization finished
Cannot add objects with duplicate keys (offending key: UMAP_), setting key to 'umap_'
DimPlot(tcells.ATAC.union, reduction = "umap", group.by = "seurat_clusters", label = TRUE) + ggtitle("ATAC gmat")
Makes imputed transcriptome profile for the ATAC-seq cells to allow co-embedding
sce.list <- tcells.sce.list
reference = "RNA"
query = "ATAC"
seurat.list <- imap(sce.list, ~ as.Seurat(.x, assay=.y))
seurat.list <- imap(seurat.list, ~ RenameCells(.x, add.cell.id=.y))
## Scale data
seurat.list <- map(seurat.list, ~ ScaleData(.x))
## Calculate CCA anchors
transfer.anchors <- FindTransferAnchors(reference = seurat.list[[reference]],
query = seurat.list[[query]],
features = integrate_features_union,
reduction = "cca")
## Impute expression profiles for ATAC cells (for all genes, not just integration features)
refdata <- GetAssayData(seurat.list$RNA, assay = "RNA", slot = "data")
imputation <- TransferData(anchorset = transfer.anchors, refdata = refdata, weight.reduction = seurat.list$ATAC[["LSI"]])
## Merge datasets and co-embed
seurat.list$ATAC[["RNA"]] <- imputation
coembed <- merge(x = seurat.list$RNA, y = seurat.list$ATAC)
coembed <- ScaleData(coembed, features = integrate_features_union, do.scale = FALSE)
coembed <- RunPCA(coembed, features = integrate_features_union, verbose = FALSE)
coembed <- RunUMAP(coembed, dims = 1:30)
coembed <- AddMetaData(coembed, metadata = ifelse(colnames(coembed) %in% colnames(seurat.list[[reference]]), reference, query), col.name = "tech")
celltype.predictions <- TransferData(anchorset = transfer.anchors,
refdata = seurat.list[[reference]]$annotation,
weight.reduction = seurat.list$ATAC[["LSI"]])
coembed <- AddMetaData(coembed, metadata = celltype.predictions)
coembed@meta.data %<>%
rownames_to_column() %>%
dplyr::mutate(annotation=ifelse(is.na(predicted.id) , annotation, NA)) %>%
column_to_rownames()
coembed@meta.data <-
coembed@meta.data %>%
rownames_to_column() %>%
dplyr::mutate(annotation=ifelse(is.na(annotation) & prediction.score.max > 0.5, predicted.id, annotation)) %>%
dplyr::mutate(annotation=ifelse(annotation=="SP (2)", NA, annotation)) %>%
column_to_rownames()
CombinePlots(
list(DimPlot(coembed, group.by = c("predicted.id"), cols = cell.type.pal) + ggtitle("prediction"),
DimPlot(coembed, group.by = c("annotation"), cols = cell.type.pal) + ggtitle("Original + prediction")),
legend = "top"
)
FeaturePlot(coembed, features = "prediction.score.max", cells = which(coembed$tech=="ATAC")) + scale_color_viridis_c()
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
Identify cell of origin among the DN cells based on expression of IGLL1 and CD34
FeaturePlot(coembed, features = c("IGLL1", "CD34"), split.by = "tech", slot = "data", cols = viridis::viridis(n=100))
cell.oo <-
coembed@meta.data %>%
rownames_to_column("cell") %>%
mutate(IGLL1=coembed@assays$RNA@counts["IGLL1",cell]) %>%
select(cell, annotation, IGLL1) %>%
arrange(-IGLL1) %>%
filter(annotation=="DN") %>%
top_n(1, IGLL1) %>%
pull(cell)
coembed@reductions$umap@cell.embeddings %>%
as.tibble(rownames="cell") %>%
mutate(cell.oo = ifelse(cell %in% cell.oo, T, F)) %>%
ggplot(aes(UMAP_1, UMAP_2)) +
geom_point(color="grey50") +
geom_point(data=. %>% filter(cell.oo),color='red') +
ggrepel::geom_text_repel(data=. %>% filter(cell.oo), aes(label="cell of origin"), color='red') +
theme_cowplot()
coembed <- AddMetaData(coembed, ifelse(colnames(coembed)==cell.oo, TRUE, FALSE), col.name = "iroot_cell")
merged.sce <- SingleCellExperiment(list(counts=coembed@assays$RNA@counts, logcounts=coembed@assays$RNA@data), colData=coembed@meta.data[, c("annotation", "tech", "iroot_cell")],
reducedDims = map(coembed@reductions, ~ .x@cell.embeddings))
saveRDS(object = merged.sce, "~/my_data/Tcells_CCA_integration_20191203.RDS")
saveRDS(object = integrate_features_union, "~/my_data/intFeatures_Tcells_CCA_integration_20191203.RDS")
I infer pseudotime using the diffusion pseudotime algorithm as implemented in scanpy. Making an R/reticulate wrapper for this function would be nice, but for now, see multiOmic_benchmark/DPT_tcells.ipynb.
Read scanpy output and save in R object.
dpt <- read.csv('~/my_data/Tcells_CCA_integration_20191127_scanpy_dpt.csv') %>%
select(X, dpt_pseudotime)
coembed <- AddMetaData(coembed, column_to_rownames(dpt, 'X'))
saveRDS(coembed, "~/my_data/Tcells_CCA_integration_seurat_20191203.Rmd")
coembed <- readRDS("~/my_data/Tcells_CCA_integration_seurat_20191203.Rmd")
Visualize pseudotime
FeaturePlot(coembed, reduction = "umap", feature = "dpt_pseudotime", split.by = "tech", col=viridis::viridis(10))
Save figure
coembed.umaps.pl <- plot_grid(
DimPlot(coembed, group.by = c("tech")) + theme(legend.position = "top"),
DimPlot(coembed, group.by = c("annotation"), cols = cell.type.pal, label = TRUE, label.size = 5) + theme(legend.position = "none"),
FeaturePlot(coembed, reduction = "umap", feature = "dpt_pseudotime") + scale_color_viridis_c(name="Diffusion\npseudotime") + ggtitle(""),
nrow=1, ncol=3, rel_widths = c(1,1,1.2),
labels = c("A", "B", "C")
)
Scale for 'colour' is already present. Adding another scale for 'colour', which will replace the existing scale.
coembed.umaps.pl +
ggsave(paste0(outdir, "coembed_umaps.png"), width=12, height = 4)
coembed@meta.data %>%
dplyr::mutate(`DPT rank`=dense_rank(dpt_pseudotime)) %>%
ggplot(aes(`DPT rank`)) +
geom_histogram(aes(fill=annotation), bins=50) +
facet_grid(annotation~tech, scales="free_y") +
theme_bw(base_size = 16) +
scale_fill_manual(values = cell.type.pal)
Check expression of markers along pseudotime
coembed@assays$RNA@data[t.cell.markers$known.markers, ] %>%
as.matrix() %>%
reshape2::melt(varnames=c("gene", "cell")) %>%
left_join(coembed@meta.data[,"dpt_pseudotime", drop=F] %>% rownames_to_column("cell")) %>%
mutate(pseudotime.rank=dense_rank(dpt_pseudotime)) %>%
group_by(gene) %>%
arrange(pseudotime.rank) %>%
# mutate(value=(value-min(value))/max(value)-min(value)) %>%
mutate(value=zoo::rollmean(value, k=5, fill=NA)) %>%
# mutate(value=(value-mean(value))/sd(value)) %>%
ungroup() %>%
mutate(gene=factor(gene, levels=rev(unique(gene)))) %>%
ggplot(aes(pseudotime.rank, gene, fill=value)) +
geom_tile() +
scale_fill_viridis_c(name="log expression") +
theme_bw(base_size = 16) +
theme(panel.border = element_blank(), panel.grid = element_blank())
Check accessibility of markers along pseudotime
FeaturePlot(coembed, feature=c("CD4"), slot = "data", split.by = "tech")
All cells have the same value (0) of CD4.
Bin pseudotime and visualize cell type composition
dpt.df <-
coembed@meta.data %>%
rownames_to_column("cell") %>%
dplyr::mutate(dpt_rank=dense_rank(dpt_pseudotime)) %>%
mutate(dpt_bin=cut(dpt_rank, breaks = 100)) %>%
mutate(dpt_bin=as.numeric(dpt_bin)) %>%
select(cell,tech, annotation, prediction.score.max, dpt_bin, dpt_pseudotime, dpt_rank)
cell.type.pl <- dpt.df %>%
ggplot(aes(dpt_bin, fill = annotation)) +
# geom_histogram(bins=100) +
geom_bar() +
scale_fill_manual(values=cell.type.pal, na.value="grey50") +
facet_grid(tech~., scales="free_y") +
xlab("Pseudotime bin") +
theme_bw(base_size = 16)
cell.type.pl
Correlation between global accessibility and pseudotime ordering.
coembed.umaps.pl <- plot_grid(
DimPlot(coembed, group.by = c("tech")) + theme_classic(base_size = 16) + theme(legend.position = "top") ,
DimPlot(coembed, group.by = c("annotation"), cols = cell.type.pal, label = TRUE, label.size = 5) + theme_classic(base_size = 16) + theme(legend.position = "none"),
FeaturePlot(coembed, reduction = "umap", feature = "dpt_pseudotime") +
theme_classic(base_size = 16) +
scale_color_viridis_c(name="Diffusion pseudotime") + ggtitle("") +
guides(color=guide_colorbar(barwidth = 10,title.position = "top")) +
theme(legend.position = "top", legend.justification = "center") ,
nrow=3, ncol=1, rel_heights = c(1.2,1,1.5)
)
plot_grid(coembed.umaps.pl, dpt.pl, rel_widths=c(1,1.9)) +
ggsave(paste0(outdir, "tcells_figure.png"), width=14, height = 10)
I initially wanted to call peaks from SnapATAC clusters, then build a cell x peak matrix on those detected peaks, but SnapATAC/MACS2 don’t seem to work.
Alternative: load peak matrix from cellranger and add to snap object
filt.peaks <- Read10X_h5("~/my_data/filtered_peak_bc_matrix.h5")
peaks.mat <- str_split(rownames(filt.peaks), pattern = ":|-") %>% map(rbind) %>% purrr::reduce(rbind)
peaks.gr <- GRanges(peaks.mat[,1], IRanges(as.numeric(peaks.mat[,2]), as.numeric(peaks.mat[,3])))
snap.pmat <- createSnapFromPmat(mat=t(filt.peaks[,snap.out@barcode]), barcodes=snap.out@barcode, peaks=peaks.gr)
snap.pmat
Calculating deviations in TF accessibility using ChromVAR. This is a measure of how much is motif accessibility in each cell is enriched compared to all the cells and general cell coverage. While SnapATAC has an wrapper around ChromVAR that outputs the deviation matrix, I just take the code from that function and run every step separately to keep the useful outputs and statistics of chromVAR.
snap.pmat = makeBinary(snap.pmat, "pmat")
obj = snap.pmat
input.mat="pmat"
min.count=10
species="Homo sapiens"
genome=BSgenome.Hsapiens.UCSC.hg38
data.use = obj@pmat
peak.use = obj@peak
ncell = nrow(data.use)
idy = which(Matrix::colSums(data.use) >= min.count)
data.use = data.use[,idy,dropping=TRUE]
peak.use = peak.use[idy]
rse <- SummarizedExperiment(
assays = list(counts = t(data.use)),
rowRanges = peak.use,
colData = DataFrame(Cell_Type=1:nrow(data.use), depth=Matrix::rowSums(data.use))
);
rse <- addGCBias(rse, genome = genome);
motifs <- getJasparMotifs(collection = "CORE", species=species)
motif_mm <- matchMotifs(motifs, rse, genome = genome);
dev <- computeDeviations(object = rse, annotations = motif_mm);
var <- computeVariability(dev)
Save
rowData(dev) %<>%
as.tibble(rownames="motif") %>%
full_join(var) %>%
column_to_rownames('motif') %>%
DataFrame()
saveRDS(dev, "~/my_data/Tcells_peaks/Tcells_chromVarOutput.RDS")
var %>%
mutate(signif=ifelse(p_value_adj < 0.01, "signif", NA)) %>%
mutate(rank=rank(-variability)) %>%
ggplot(aes(rank, variability, color=signif)) +
geom_point() +
ggrepel::geom_text_repel(data=. %>% filter(rank < 80 & rank > 50), aes(label=name)) +
geom_vline(xintercept = 50)
Visualize deviation scores of the most variable motifs, ordered in pseudotime.
sample_dpt_bins.df <- dpt.df %>%
mutate(cell=str_remove(cell, "^ATAC_")) %>%
filter(tech=="ATAC") %>%
arrange(dpt_pseudotime)
motif.topvar <- var %>% rownames_to_column("motif") %>% top_n(50,variability) %>% pull(motif)
tf.topvar <- motif.topvar %>% str_remove(".+_") %>% str_remove("\\(.+|:.+")
mmat.topvar <- dev@assays$data$z[motif.topvar,sample_dpt_bins.df$cell]
rownames(mmat.topvar) <- tf.topvar
smooth.mmat <- apply(mmat.topvar, 1, function(x) zoo::rollmean(x, k=30)) %>% t()
tiff(paste0(outdir, "chromVAR_motif_heatmap.tiff"), width=900, height = 900)
# pdf(paste0(outdir, "chromVAR_motif_heatmap.pdf"), width=9, height = 10)
smooth.mmat %>%
# mmat.topvar[,sample_dpt_bins.df$cell] %>%
pheatmap::pheatmap(show_colnames = F, cluster_cols = F, clustering_distance_rows = "correlation",
annotation_col = sample_dpt_bins.df[,c("cell", "annotation", "dpt_pseudotime")] %>% column_to_rownames("cell"),
annotation_colors = list(annotation=cell.type.pal, dpt_pseudotime=viridis::viridis(100)), fontsize = 18, fontsize_row = 14,
# color = colorRampPalette(rev(brewer.pal(n = 7, name ="Spectral")))(100))
breaks=seq(-3,3, length.out = 100), legend = F, annotation_legend = F,
legend_breaks = c(-2, -1, 0, 1, 2, 3), legend_labels = c("-2", "-1", "0", "1","2", "Deviation\n(z-score)")
)
dev.off()
Compare motif accessibility trend with gene expression trend along pseudotime. I find both examples of correlation between accessibility and TF expression (e.g. RUNX2, ELK3) and anti-correlation (e.g. JUN, ETV6).
counts.topvar <- coembed@assays$RNA@data[tf.topvar[which(tf.topvar %in% rownames(coembed@assays$RNA@data))],dpt.order$cell]
gex.df <-
reshape2::melt(as.matrix(counts.topvar), varnames=c("gene", "cell")) %>%
# rowid_to_column("dpt_order") %>%
mutate(data="Gene\nexpression")
mmat.df <- reshape2::melt(mmat.topvar, varnames=c("gene", "cell")) %>%
# rowid_to_column("dpt_order") %>%
mutate(cell=str_c("ATAC_", cell)) %>%
mutate(data="Motif\ndeviation")
plot.tfs <- function(plot.tfs){
bind_rows(gex.df, mmat.df) %>%
left_join(dpt.df[, c("cell", "dpt_pseudotime", "annotation")], by="cell") %>%
mutate(dpt_rank=dense_rank(dpt_pseudotime)) %>%
drop_na(dpt_pseudotime) %>%
mutate(data=factor(data, levels=c("Gene\nexpression", "smooth", "Motif\ndeviation"))) %>%
filter(gene %in% plot.tfs) %>%
ggplot(aes(dpt_rank, value, color=data)) +
# geom_point(data=. %>% filter(data!="smooth"), aes(color=annotation), size=0.7, alpha=0.3) +
geom_smooth( method="loess",span=0.2) +
facet_grid(data~gene, scales="free") +
xlab("Pseudotime rank") +
theme_bw(base_size = 16)
}
tfs <- c("JUN", "FOSL2", "FOSL1", "FOS")
pdf(paste0(outdir, "TF_plots.pdf"), width = 8, height = 5)
for (tf in tf.topvar) {
print(plot.tfs(tf))
}
dev.off()
map(list("JUN", "ELK3", "RUNX2", "REL", "FOS", "ETV6", "TCF3"), ~ plot.tfs(.x) + ggsave(paste0(outdir, paste0('TF_plot_',.x,".png")), width = 8, height=5))
tfs <- c("RUNX2", "ELK3","JUN", "ETV6")
plot.tfs(tfs)
tf.list <- map(tfs, ~ plot.tfs(.x))
tf.list <- map(tf.list, ~ .x + theme(legend.position = "none"))
tf.list <- map_if(tf.list, c(TRUE, TRUE, TRUE, FALSE), ~ .x + theme(axis.title.x = element_blank(), axis.ticks.x = element_blank(), axis.text.x=element_blank() ))
tf.list
plot_grid(plotlist = tf.list, align="v", axis="l", nrow=4, ncol=1, rel_heights = c(1,1,1,1.15)) +
ggsave(paste0(outdir, "TF_pl_fig.png"), height = 11, width = 5)
plot.tfs("TFAP4")
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding factor and character vector, coercing into character vectorbinding character and factor vector, coercing into character vector
–>
–> –> –> –> –> –> –> –>
–> –> –> –> –> –> –>
–> –> –> –> –>
–> –> –> –>
–> –> –>